import pandas as pd
import socket
import os
import yaml
import sys
import statsmodels.api as sm
from scipy import stats
import pyranges as pr
from pyliftover import LiftOver
import seaborn as sns
import scanpy as sc
from matplotlib import rcParams
import matplotlib.pylab as plt
import numpy as np
import anndata as ad

# Default figure size (A4 landscape, in inches).
rcParams['figure.figsize'] = 11.7, 8.27

outdir = "../data/output"
# Map of raw iPSC line identifiers -> display name / plotting color.
with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
colorsmap = dict(zip([i["newName"] for i in iPSC_lines_map.values()],
                     [i["color"] for i in iPSC_lines_map.values()]))

figDir = "./figures"
if not os.path.exists(figDir):
    # Create a new directory because it does not exist
    os.makedirs(figDir)

vcfPath = "../data/jointGenotype.g.vcf"
#Path to cellranger gex-GRCh38-2020-A gtf
gtfPath = "../data/resources/genes.gtf"
#Path to kolf from HIPSCI
kolfGTRaw = "../data/resources/HPSI0114i-kolf_2.wgs.gatk.haplotype_caller.20170425.genotypes.vcf.gz"
# Path where BULKrna / WGS matched loci will be written
kolfWGS_Matched_loci = outdir + "/WGS_Matched_loci.kolf.tsv"

# Host-specific paths are resolved from a per-user config keyed by the
# hostname root (first two dash-separated tokens).
hostRoot = "-".join(socket.gethostname().split('-')[0:2])
with open(os.path.expanduser('~') + "/paths_config.yaml", 'r') as f:
    paths = yaml.load(f, Loader=yaml.FullLoader)
# NOTE: the original collapsed the following assignments onto single lines
# (invalid Python); each statement is now on its own line.
indir = paths["paths"]["indir"][hostRoot]
outdir = paths["paths"]["outdir"][hostRoot]  # overrides the relative default above
projectBaseDir = paths["paths"]["projectBaseDir"][hostRoot]
markers = paths["paths"]["markers"][hostRoot]
score_genes_mod_path = paths["paths"]["score_genes_mod"][hostRoot]

with open(projectBaseDir + "/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
with open(markers, 'r') as f:
    markers = yaml.load(f, Loader=yaml.FullLoader)["markers"]["CBOs"]

# score_genes_mod lives outside the package; the path comes from the config.
sys.path.append(score_genes_mod_path)
from score_genes_mod import *

# Absolute (cluster) input/output paths; these override the defaults above.
vcfPath = "../data/jointGenotype.g.vcf"
kolfGT = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing_io/data/KOLF.WGS.unflitered.hg38.bed"
kolfGTRaw = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing_io/data/external_genotypes/HPSI0114i-kolf_2.wgs.gatk.haplotype_caller.20170425.genotypes.vcf.gz"
kolfWGS_Matched_loci = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing_io/data/external_genotypes/WGS_Matched_loci.kolf.tsv"
acghPath = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing_io/data/external_genotypes/GSA2019_414_025_v2_FinalReport.headless.txt"
outDF = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing/downstreamAnalysis/ASE_FOCUS/WGSmappings.tsv"
outCumulativeASE = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing/downstreamAnalysis/ASE_FOCUS/CumulativeASE.tsv"
outMappedVariants = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing/downstreamAnalysis/ASE_FOCUS/MappedVariants.tsv"
def liftOverDF(inDF, chrKey, posKey, indexerKey, startGenome, targetGenome):
    """
    Lift over genomic coordinates in a DataFrame between genome assemblies.

    inDF : DataFrame with chromosome, position and indexer columns.
    chrKey / posKey : column names holding chromosome and position.
    indexerKey : column holding the "<chr>_<pos>" locus identifier.
    startGenome / targetGenome : assembly names understood by pyliftover
        (e.g. "hg38", "hg19").

    Returns a DataFrame restricted to uniquely-lifted loci, with
    "old_*_<startGenome>" columns holding the original coordinates and
    "*_<targetGenome>" columns holding the lifted ones.
    """
    lo = LiftOver(startGenome, targetGenome)
    Liftedvariants = []
    __inDF = inDF.copy()
    # pyliftover expects "chr"-prefixed chromosome names; remember whether
    # the input already had the prefix so it can be restored on output.
    if not __inDF[chrKey].head(1).tolist()[0].startswith("chr"):
        chrRoot = False
        __inDF[chrKey] = "chr" + __inDF[chrKey].astype(str)
    else:
        chrRoot = True
    for row in __inDF.index.tolist():
        pos = __inDF.loc[row]
        posLiftedStart = lo.convert_coordinate(str(pos[chrKey]), pos[posKey])
        # Keep only positions with exactly one lift-over target.
        # BUGFIX: test for None before len() — the original evaluated
        # len(posLiftedStart) first, which raises TypeError when pyliftover
        # returns None for an unknown chromosome.
        if (posLiftedStart is not None) and (len(posLiftedStart) == 1):
            Liftedvariants.append([posLiftedStart[0][0], posLiftedStart[0][1], row])
    LiftedEnhancers = pd.DataFrame(Liftedvariants, columns=["lifted_" + chrKey, "lifted_" + posKey, "indexRow"])
    LiftedEnhancers.index = LiftedEnhancers["indexRow"]
    # Inner merge drops loci that failed to lift.
    outDF = pd.merge(__inDF, LiftedEnhancers, how="inner", left_index=True, right_index=True)
    outDF["old_" + chrKey + "_" + str(startGenome)] = outDF[chrKey].astype(str).str.split("chr", expand=True)[1] if not chrRoot else outDF[chrKey]
    outDF["old_" + posKey + "_" + str(startGenome)] = outDF[posKey]
    outDF[chrKey + "_" + str(targetGenome)] = outDF["lifted_" + chrKey].astype(str).str.split("chr", expand=True)[1] if not chrRoot else outDF["lifted_" + chrKey].astype(str)
    outDF[posKey + "_" + str(targetGenome)] = outDF["lifted_" + posKey]
    # BUGFIX: the original assigned the whole expand=True DataFrame to a
    # single column. The indexer ("<chr>_<pos>") contains no "chr", so the
    # split yields one column equal to the original value — select it
    # explicitly so a Series (not a DataFrame) is assigned.
    outDF["old_" + indexerKey + "_" + str(startGenome)] = outDF[indexerKey].str.split("chr", expand=True)[0]
    outDF[indexerKey + "_" + str(targetGenome)] = outDF[chrKey + "_" + str(targetGenome)] + "_" + outDF["lifted_" + posKey].astype(str)
    outDF.drop(columns=["lifted_" + chrKey, "lifted_" + posKey, chrKey, posKey, indexerKey], inplace=True)
    return outDF
def get_vcf_names(vcf_path):
    """
    Return the column names from a VCF header.

    Scans the file for the "#CHROM" header line and returns its
    tab-separated fields (the leading "#CHROM" marker and the trailing
    newline on the last field are kept; callers strip them).

    Raises:
        ValueError: if no "#CHROM" header line is found (the original would
            have raised a confusing NameError instead).
    """
    # 'with' closes the file on every exit path; the original's explicit
    # ifile.close() after the loop was redundant.
    with open(vcf_path, "rt") as ifile:
        for line in ifile:
            if line.startswith("#CHROM"):
                return line.split('\t')
    raise ValueError("No #CHROM header line found in {}".format(vcf_path))
# Parse VCF column names, stripping the leading '#' and trailing newline.
names = [i.replace("#", "").replace('\n', '') for i in get_vcf_names(vcfPath)]
# Map raw sample/line identifiers to their display names.
# (Simplified from the original's nested dict-of-dicts comprehension.)
mappings = {k: v["newName"] for k, v in iPSC_lines_map.items()}
vcf = pd.read_csv(vcfPath, sep="\t", skip_blank_lines=True, comment='#',
                  names=names).rename(columns=mappings)
# Build a PyRanges of the variant positions (1 bp intervals) for gene overlap.
MappedVars = vcf.copy()
MappedVars["Chromosome"] = "chr" + vcf["CHROM"].astype(str)
MappedVars["End"] = MappedVars["POS"]
MappedVars["Start"] = MappedVars["POS"]
MappedVars = MappedVars[["Chromosome", "Start", "End", "CHROM"]]
MappedVars = pr.PyRanges(MappedVars)
MappedVars
#Prepare gtf for overlap, we exclude non genes features and keep only coding genes
gtf = pd.read_csv(gtfPath, sep="\t", comment='#', header=None)
gtf = gtf[gtf[2] == "gene"]
gtf['GENE'] = gtf[8].str.extract(r'gene_name "(.+?)";')
gtf['GENEtype'] = gtf[8].str.extract(r'gene_type "(.+?)";')
gtf = gtf[gtf['GENEtype'] == "protein_coding"]
gtf = gtf[[0, 3, 4, "GENE"]]
gtf.columns = ["Chromosome", "Start", "End", "SYMBOL"]
gtf = pr.PyRanges(gtf)
gtf
# Annotate each variant with its overlapping protein-coding gene symbol and
# index both tables by "<chrom>_<pos>" so they can be aligned later.
MappedVars = MappedVars.join(gtf).df.drop_duplicates(subset=["Chromosome", "Start", "End"]).drop(columns=["Start_b", "End_b"])
MappedVars.index = MappedVars["CHROM"].astype(str) + "_" + MappedVars["Start"].astype(str)
MappedVars["indexer"] = MappedVars.index.tolist()
vcf.index = vcf["CHROM"].astype(str) + "_" + vcf["POS"].astype(str)
/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3169: DtypeWarning: Columns (0) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
MappedVars
| Chromosome | Start | End | CHROM | SYMBOL | indexer | |
|---|---|---|---|---|---|---|
| 1_944296 | chr1 | 944296 | 944296 | 1 | SAMD11 | 1_944296 |
| 1_944307 | chr1 | 944307 | 944307 | 1 | SAMD11 | 1_944307 |
| 1_944531 | chr1 | 944531 | 944531 | 1 | SAMD11 | 1_944531 |
| 1_946247 | chr1 | 946247 | 946247 | 1 | NOC2L | 1_946247 |
| 1_946538 | chr1 | 946538 | 946538 | 1 | NOC2L | 1_946538 |
| ... | ... | ... | ... | ... | ... | ... |
| Y_12709406 | chrY | 12709406 | 12709406 | Y | USP9Y | Y_12709406 |
| Y_12914649 | chrY | 12914649 | 12914649 | Y | DDX3Y | Y_12914649 |
| Y_12914655 | chrY | 12914655 | 12914655 | Y | DDX3Y | Y_12914655 |
| Y_13251168 | chrY | 13251168 | 13251168 | Y | UTY | Y_13251168 |
| Y_13395391 | chrY | 13395391 | 13395391 | Y | UTY | Y_13395391 |
83720 rows × 6 columns
def aseByObs(adataVar, genoOBS, geno, threshold, fdr, vcf, celltypeOBS, mappedVariants, groupby=None, groups=None):
    """
    Test allele-specific expression (ASE) per cell type via binomial tests.

    The whole function pivots around celltypes division, that is in our
    opinion the main covariate that cannot be neglected during ASE
    exploration.

    adataVar : AnnData of cells x variant loci with "RefReads"/"AltReads" layers.
    genoOBS : obs column containing the second covariate to isolate.
    geno : the genotype to isolate the analysis on (to analyse several
        genotypes run multiple times with different geno values); the name
        should be one of the columns of the VCF file.
        * If your sample contains only 1 genotype you should still use the
          genoOBS/geno arguments (provide a mock column with the same value).
        * To aggregate from multiple genotypes (ignoring genoOBS covariate
          isolation at your own risk) set genoOBS=yourGenoObs and geno=None.
    threshold : minReads for a locus to be tested (per cell type).
    fdr : Benjamini-Hochberg alpha used for multiple-testing correction.
    vcf : genotype table indexed by "<chrom>_<pos>".
    celltypeOBS : obs column with the cell-type labels.
    mappedVariants : secondary DF with complementary info to vcf (rs, gene
        mapping etc).
    groupby / groups : optional extra obs filter applied with geno.

    Returns a DataFrame with per-locus, per-celltype binomial p-values, BH
    FDR, significance flags and alt-allele proportions.
    """
    # BUGFIX: the original body referenced the module-level `varAdata`
    # instead of the `adataVar` argument; alias locally so the argument is
    # actually honored.
    varAdata = adataVar
    # First we keep only SNPs mapped to genes
    vcf = vcf.loc[mappedVariants.index.tolist()]
    if not isinstance(groups, list):
        groups = [groups]
    if geno is not None:
        # Heterozygous ("0/1") loci for the selected genotype.
        hetGeno = vcf.loc[vcf[geno].str.split(":", expand=True)[0] == '0/1', ["CHROM", "POS"]].astype(str).apply('_'.join, axis=1).tolist()
        print(len(hetGeno))
        # Select for ID, obs and Het loci from bulk
        if groupby is not None:
            varAdata_DS_geno = varAdata[
                ((varAdata.obs[genoOBS] == geno) & (varAdata.obs[groupby].isin(groups))),
                list(set(hetGeno).intersection(set(varAdata.var_names.tolist())))
            ]
        else:
            varAdata_DS_geno = varAdata[varAdata.obs[genoOBS] == geno,
                list(set(hetGeno).intersection(set(varAdata.var_names.tolist())))
            ]
    else:
        # Aggregate information from all genotypes after checking they are
        # all het at the given locus.
        GenoAdataDict = {}
        for g in varAdata.obs[genoOBS].unique():
            hetGeno = vcf.loc[vcf[g].str.contains("0/1"), g].index.tolist()
            hetGeno = list(set(hetGeno).intersection(set(varAdata.var_names.tolist())))
            GenoAdataDict[g] = varAdata[varAdata.obs[genoOBS] == g, hetGeno]
        varAdata_DS_geno = ad.concat(list(GenoAdataDict.values()), join="outer")
        del GenoAdataDict
    # Sum Ref/Alt reads per cell type -> loci x celltypes matrices.
    AltReadsDF = pd.DataFrame([pd.Series((varAdata_DS_geno[varAdata_DS_geno.obs[celltypeOBS] == cl].layers["AltReads"].sum(axis=0).flatten().A1), index=varAdata_DS_geno.var_names) for cl in varAdata_DS_geno.obs[celltypeOBS].unique()], index=varAdata_DS_geno.obs[celltypeOBS].unique()).T
    RefReadsDF = pd.DataFrame([pd.Series((varAdata_DS_geno[varAdata_DS_geno.obs[celltypeOBS] == cl].layers["RefReads"].sum(axis=0).flatten().A1), index=varAdata_DS_geno.var_names) for cl in varAdata_DS_geno.obs[celltypeOBS].unique()], index=varAdata_DS_geno.obs[celltypeOBS].unique()).T
    TotalDF = AltReadsDF + RefReadsDF
    ASEdf = pd.DataFrame()
    # Perform cluster-wise thresholding and test
    for cl in TotalDF.columns:
        MinCountsLoci = TotalDF[TotalDF[cl] > threshold].index.tolist()
        testDF = pd.DataFrame({"AltReads": AltReadsDF.loc[MinCountsLoci, cl],
                               "RefReads": RefReadsDF.loc[MinCountsLoci, cl]})
        # Two-sided binomial test against p=0.5 (balanced allelic expression).
        # NOTE(review): scipy.stats.binom_test is deprecated and removed in
        # scipy >= 1.12 — migrate to stats.binomtest(k, n, p).pvalue when
        # upgrading scipy.
        result = testDF.apply(lambda x: stats.binom_test(
            np.array([x["AltReads"], x["RefReads"]]),
            p=.5),
            axis=1)
        if len(result) > 0:
            result = result.to_frame(name="pVal")
            print("Cluster ", cl)
            print("Total tested loci :", testDF.shape[0])
            print("NonCorrected p < 0.05 :", (result["pVal"] < 0.05).sum())
            fdrCorr = sm.stats.multipletests(result["pVal"], method='fdr_bh', alpha=fdr, is_sorted=False)
            result["Significant"] = fdrCorr[0]
            result["FDR"] = fdrCorr[1]
            print("Corrected FDR < 0.05 :", result["Significant"].sum())
            clDF = result.merge(right=mappedVariants[["SYMBOL", "indexer"]],
                                left_index=True, right_index=True)
            print("""
""")
            clDF["celltype"] = cl
            clDF["AltReads"] = AltReadsDF.loc[clDF.index, cl]
            clDF["RefReads"] = RefReadsDF.loc[clDF.index, cl]
            clDF["Prop"] = clDF["AltReads"] / (clDF["AltReads"] + clDF["RefReads"])
            ASEdf = pd.concat([ASEdf, clDF], ignore_index=True).sort_values("FDR", ascending=True)
    print("Total tested loci :", len(ASEdf.indexer.unique()))
    print("Total significant scored loci :", ASEdf.Significant.sum())
    return ASEdf
# Load the per-variant AnnData (cells x variant loci, with the
# "RefReads"/"AltReads" layers used throughout the ASE analysis).
varAdata = sc.read_h5ad(outdir+"/adatas/VarAnndata_complete.h5ad")
# Datasets split by experimental design: non-chimeric vs chimeric organoids.
non_chimeric = varAdata.obs.loc[varAdata.obs["type"] == "non_chimeric","dataset"].unique().tolist()
print(non_chimeric)
chimeric = varAdata.obs.loc[varAdata.obs["type"] == "chimeric","dataset"].unique().tolist()
print(chimeric)
['DownD50', 'DownD100', 'DownD250'] ['UpD50', 'UpD300', 'UpD100_2', 'UpD100_1']
# ASE test restricted to the CTL08A genotype, per annotated cell type,
# requiring > 50 reads per locus per cell type, BH-FDR at 0.05.
ASE = aseByObs(adataVar=varAdata,genoOBS="cellID_newName", geno="CTL08A",celltypeOBS="leidenAnnotated",threshold= 50,fdr=0.05 ,vcf=vcf,mappedVariants=MappedVars)
ASE
21543
Cluster Neurons
Total tested loci : 469
NonCorrected p < 0.05 : 181
Corrected FDR < 0.05 : 134
Cluster CajalR_like
Total tested loci : 357
NonCorrected p < 0.05 : 134
Corrected FDR < 0.05 : 106
Cluster RadialGliaProgenitors
Total tested loci : 247
NonCorrected p < 0.05 : 94
Corrected FDR < 0.05 : 64
Cluster GlutamatergicNeurons_early
Total tested loci : 558
NonCorrected p < 0.05 : 237
Corrected FDR < 0.05 : 185
Cluster ProliferatingProgenitors
Total tested loci : 343
NonCorrected p < 0.05 : 125
Corrected FDR < 0.05 : 100
Cluster MigratingNeurons
Total tested loci : 359
NonCorrected p < 0.05 : 127
Corrected FDR < 0.05 : 93
Cluster GlutamatergicNeurons_late
Total tested loci : 152
NonCorrected p < 0.05 : 59
Corrected FDR < 0.05 : 42
Cluster OuterRadialGliaAstrocytes
Total tested loci : 204
NonCorrected p < 0.05 : 77
Corrected FDR < 0.05 : 57
Cluster Interneurons
Total tested loci : 45
NonCorrected p < 0.05 : 20
Corrected FDR < 0.05 : 19
Total tested loci : 712
Total significant scored loci : 800
| pVal | Significant | FDR | SYMBOL | indexer | celltype | AltReads | RefReads | Prop | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000000e+00 | True | 0.000000e+00 | UBC | 12_124913268 | Neurons | 0.0 | 1166.0 | 0.000000 |
| 1 | 0.000000e+00 | True | 0.000000e+00 | UBC | 12_124913268 | GlutamatergicNeurons_early | 0.0 | 1297.0 | 0.000000 |
| 2 | 1.139238e-305 | True | 4.089864e-303 | UBC | 12_124913268 | MigratingNeurons | 0.0 | 1014.0 | 0.000000 |
| 3 | 2.078230e-274 | True | 7.419281e-272 | UBC | 12_124913268 | CajalR_like | 1.0 | 919.0 | 0.001087 |
| 4 | 2.491799e-206 | True | 8.546870e-204 | UBC | 12_124913268 | ProliferatingProgenitors | 0.0 | 684.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2594 | 9.110721e-01 | False | 1.000000e+00 | PNMA1 | 14_73712097 | RadialGliaProgenitors | 39.0 | 41.0 | 0.487500 |
| 2595 | 9.196068e-01 | False | 1.000000e+00 | DDX17 | 22_38483683 | RadialGliaProgenitors | 48.0 | 50.0 | 0.489796 |
| 2596 | 1.000000e+00 | False | 1.000000e+00 | AURKAIP1 | 1_1374025 | RadialGliaProgenitors | 147.0 | 146.0 | 0.501706 |
| 2598 | 1.000000e+00 | False | 1.000000e+00 | TOMM22 | 22_38684240 | GlutamatergicNeurons_early | 164.0 | 165.0 | 0.498480 |
| 2582 | 1.000000e+00 | False | 1.000000e+00 | CDK1 | 10_60794716 | ProliferatingProgenitors | 110.0 | 109.0 | 0.502283 |
2734 rows × 9 columns
# Significant ASE loci, annotated with bulk-RNA REF/ALT and the KOLF genotype.
LociTocheck = ASE.loc[ASE.Significant,"indexer"].unique().tolist()
LociTocheck = vcf.loc[LociTocheck,["REF","ALT", mappings["KOLF"]]].rename(columns={mappings["KOLF"]:"{}Genotype_bulkRNA".format(mappings["KOLF"]),
"REF":"bulkRNA_REF","ALT":"bulkRNA_ALT"})
LociTocheck.index.name = None
# Split the "<chr>_<pos>" index back into chromosome and position columns.
LociTocheck["chr"] = [i.split("_")[0] for i in LociTocheck.index.tolist()]
LociTocheck["pos"] = [int(i.split("_")[1]) for i in LociTocheck.index.tolist()]
LociTocheck["indexer"] = LociTocheck.index.tolist()
# Lift hg38 coordinates to hg19 to match the WGS reference build.
LociTocheck = liftOverDF(LociTocheck, "chr","pos","indexer", "hg38","hg19")
LociTocheck
# Write the hg19 loci list consumed by the awk-based matching step below.
LociTocheck[["chr_hg19","pos_hg19"]].to_csv(outdir+"/LociTocheckList.tsv", sep="\t", index=None, header=None)
lociTocheckPath = outdir+"/LociTocheckList.tsv"
## The WGS genotype file is big so we implemented the match via awk
%%bash -s "$lociTocheckPath" "$kolfGTRaw" "$kolfWGS_Matched_loci"
# $1 = loci list (chr, pos); $2 = gzipped WGS VCF; $3 = output TSV.
# First pass (NR==FNR) loads the (chr,pos) pairs into an array; second pass
# keeps matching WGS rows, then cut extracts CHROM..FILTER plus the genotype.
awk -v FS="\t" 'NR==FNR{a[$1,$2];next} ($1,$2) in a' $1 <(zcat $2 ) | cut -f1,2,3,4,5,6,7,10 > $3
# Load the awk-matched WGS loci (hg19) and build the "<chr>_<pos>" indexer.
chrom_wgsDF = pd.read_csv(kolfWGS_Matched_loci, header=None, sep="\t", names=["chr_wgs_hg19","pos_wgs_hg19","rs_wgs","ref_wgs","alt_wgs","q_wgs","filt_wgs","{}Genotype_WGS".format(mappings["KOLF"])])
chrom_wgsDF["indexer_hg19"] = chrom_wgsDF["chr_wgs_hg19"].astype(str)+"_"+chrom_wgsDF["pos_wgs_hg19"].astype(str)
# Keep only the first ALT allele; "<NON_REF>" placeholders become ".".
chrom_wgsDF["alt_wgs"] = chrom_wgsDF["alt_wgs"].str.split(",", expand = True)[0].replace({"<NON_REF>":"."})
chrom_wgsDF
| chr_wgs_hg19 | pos_wgs_hg19 | rs_wgs | ref_wgs | alt_wgs | q_wgs | filt_wgs | CTL08AGenotype_WGS | indexer_hg19 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1309405 | rs150194697 | T | C | 474.77 | . | 0/1:11,14,0:25:99:503,0,359,536,401,936:6,5,4,10 | 1_1309405 |
| 1 | 1 | 3697035 | rs17411356 | A | G | 692.77 | . | 0/1:13,22,0:35:99:721,0,425,760,491,1251:6,7,14,8 | 1_3697035 |
| 2 | 1 | 10521200 | . | T | . | . | . | 0/0:31:76:31:0,76,1027 | 1_10521200 |
| 3 | 1 | 10521238 | . | T | . | . | . | 0/0:34:30:34:0,30,1164 | 1_10521238 |
| 4 | 1 | 12628188 | rs4580 | T | C | 490.77 | . | 0/1:13,14,0:27:99:519,0,433,558,475,1033:6,7,6,8 | 1_12628188 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 357 | 22 | 42908913 | . | A | . | . | . | 0/0:21:18:21:0,18,664 | 22_42908913 |
| 358 | 22 | 44220673 | rs138056 | C | A | 274.77 | . | 0/1:15,10,0:25:99:303,0,570,348,599,947:7,8,4,6 | 22_44220673 |
| 359 | 22 | 49146299 | rs695689 | G | A | 154.77 | . | 0/1:11,6,0:17:99:183,0,413,217,431,648:4,7,3,3 | 22_49146299 |
| 360 | 22 | 50962208 | rs12148 | T | G | 693.77 | . | 0/1:13,19,0:32:99:722,0,419,761,476,1237:4,9,5,14 | 22_50962208 |
| 361 | X | 24095220 | . | A | . | . | . | 0/0:20:57:20:0,57,855 | X_24095220 |
362 rows × 9 columns
# Attach WGS genotype info to each significant ASE locus; the left join keeps
# loci missing from the WGS as NaN (quantified below as the non-mapping rate).
LociTocheck = LociTocheck.merge(chrom_wgsDF, on="indexer_hg19", how = "left")
mergedDF = ASE[ASE.Significant].rename(columns={"indexer":"old_indexer_hg38"}).merge(LociTocheck, on="old_indexer_hg38")
mergedDF
# Subset/reorder columns of interest into mergeDF.
mergeDF = mergedDF[["indexer_hg19","old_indexer_hg38","FDR","old_chr_hg38",
"old_pos_hg38","pos_hg19","chr_hg19", 'SYMBOL',
'celltype', 'AltReads', 'RefReads', 'Prop', 'bulkRNA_REF','bulkRNA_ALT', "{}Genotype_bulkRNA".format(mappings["KOLF"]),
'chr_wgs_hg19', 'pos_wgs_hg19', 'rs_wgs', 'ref_wgs', 'alt_wgs', "{}Genotype_WGS".format(mappings["KOLF"])]]
# One row per hg19 locus, preferring rows that carry a WGS genotype
# (NaNs sorted last before drop_duplicates keeps a genotyped row first).
mergeDF = mergeDF.sort_values(["{}Genotype_WGS".format(mappings["KOLF"])],ascending=True, na_position='last').drop_duplicates('indexer_hg19', keep='first')
print("{} out of {} loci were not found in WGS".format(mergeDF.CTL08AGenotype_WGS.isna().sum(), mergeDF.shape[0]))
print("For rate of {} percent non mapping loci".format(round(mergeDF.CTL08AGenotype_WGS.isna().sum() / mergeDF.shape[0]*100,1)))
38 out of 400 loci were not found in WGS For rate of 9.5 percent non mapping loci
# Sanity check: highly-expressed housekeeping genes (UBC/GAPDH/ACTB) should
# show concordant calls between bulk RNA and WGS.
mergeDF[mergeDF.SYMBOL.isin(["UBC","GAPDH","ACTB"])][["SYMBOL","chr_hg19","chr_wgs_hg19","pos_hg19","pos_wgs_hg19","rs_wgs","CTL08AGenotype_bulkRNA","CTL08AGenotype_WGS","bulkRNA_REF","ref_wgs","bulkRNA_ALT","alt_wgs"]].dropna()
| SYMBOL | chr_hg19 | chr_wgs_hg19 | pos_hg19 | pos_wgs_hg19 | rs_wgs | CTL08AGenotype_bulkRNA | CTL08AGenotype_WGS | bulkRNA_REF | ref_wgs | bulkRNA_ALT | alt_wgs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 537 | ACTB | 7 | 7 | 5567112 | 5567112.0 | rs7612 | 0/1:650,524:1174:99:16596,0,17870 | 0/1:23,15,0:38:99:496,0,778,565,823,1387:11,12... | C | C | T | T |
| 164 | ACTB | 7 | 7 | 5567020 | 5567020.0 | rs138738998 | 0/1:494,457:951:99:15017,0,14961 | 0/1:23,17,0:40:99:534,0,831,602,882,1484:13,10... | C | C | T | T |
| 189 | UBC | 12 | 12 | 125397364 | 125397364.0 | rs8963 | 0/1:94,352:446:99:10810,0,2230 | 1/1:2,20,0:22:16:0|1:125397358_T_C:606,16,0,61... | A | A | G | G |
# Work on an explicit copy so the column assignments below do not raise
# SettingWithCopyWarning (the original assigned into a slice-derived frame).
mergeDFUnique = mergeDF.drop_duplicates('indexer_hg19', keep='first').copy()
# Keep only the GT field (e.g. "0/1") from the colon-separated genotype string.
mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] = mergeDFUnique["CTL08AGenotype_bulkRNA"].str.split(":", expand=True)[0]
mergeDFUnique["CTL08AGenotype_WGS_sankey"] = mergeDFUnique["CTL08AGenotype_WGS"].str.split(":", expand=True)[0]
mergeDFUnique = mergeDFUnique.fillna("NotCovered")
# Prefix by assay so the two sides of the Sankey are distinguishable.
mergeDFUnique["CTL08AGenotype_WGS_sankey"] = "WGS_" + mergeDFUnique["CTL08AGenotype_WGS_sankey"]
mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] = "RNA_" + mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"]
import holoviews as hv
# Sankey: bulk-RNA genotype category (left) vs WGS genotype category (right),
# one flow per (RNA, WGS) pair weighted by locus count.
obsTupleToMap = ("CTL08AGenotype_bulkRNA_sankey","CTL08AGenotype_WGS_sankey")
SankeyDF=mergeDFUnique[list(obsTupleToMap)]
SankeyDF.columns = [obsTupleToMap[0],obsTupleToMap[1]]
# Count loci per (RNA genotype, WGS genotype) pair; drop empty pairs.
SankeyDF = SankeyDF.groupby([obsTupleToMap[0],obsTupleToMap[1]]).size().reset_index().rename(columns={0:'count'})
SankeyDF=SankeyDF[SankeyDF["count"] != 0]
hv.extension('bokeh')
sankey1 = hv.Sankey(SankeyDF, kdims=[obsTupleToMap[0],obsTupleToMap[1]], vdims="count")
sankey1.opts(label_position='outer',
edge_color=obsTupleToMap[0], edge_line_width=0,
node_alpha=1.0, node_width=40, node_sort=False,
width=1100, height=700, bgcolor="white")
# Classify significant loci by agreement between bulk-RNA and WGS GT calls,
# tallied by base substitution (REF>ALT). Each slice is copied explicitly to
# fix the SettingWithCopyWarning the original emitted on assignment.
Concordant = mergeDF.loc[(mergeDF["CTL08AGenotype_bulkRNA"].str.split(":", expand=True)[0] == mergeDF["CTL08AGenotype_WGS"].str.split(":", expand=True)[0])].copy()
Concordant["BaseEdit"] = Concordant["bulkRNA_REF"] + ">" + Concordant["bulkRNA_ALT"]
Concordant = pd.DataFrame(Concordant.groupby(["BaseEdit"], as_index=False).size()).rename(columns={0: "counts"})
Concordant["Agreement"] = "concordant"
# Discordant: GT calls differ and both assays have a call (dropna).
Discordant = mergeDF.loc[~(mergeDF["CTL08AGenotype_bulkRNA"].str.split(":", expand=True)[0] == mergeDF["CTL08AGenotype_WGS"].str.split(":", expand=True)[0])].dropna().copy()
Discordant["BaseEdit"] = Discordant["bulkRNA_REF"] + ">" + Discordant["bulkRNA_ALT"]
Discordant = pd.DataFrame(Discordant.groupby(["BaseEdit"], as_index=False).size()).rename(columns={0: "counts"})
Discordant["Agreement"] = "Discordant"
# Unmapped: locus absent from the WGS genotype file.
Unmapped = mergeDF.loc[mergeDF["CTL08AGenotype_WGS"].isna()].copy()
Unmapped["BaseEdit"] = Unmapped["bulkRNA_REF"] + ">" + Unmapped["bulkRNA_ALT"]
Unmapped = pd.DataFrame(Unmapped.groupby(["BaseEdit"], as_index=False).size()).rename(columns={0: "counts"})
Unmapped["Agreement"] = "Unmapped"
PieDF = pd.concat([Concordant, Discordant, Unmapped], axis=0, ignore_index=True)
# NOTE(review): in current pandas groupby(...).size() names its column
# "size" (the rename of column 0 above is a no-op there), which is why
# vdims="size" works — verify against the installed pandas version.
sankey1 = hv.Sankey(PieDF, kdims=["Agreement", "BaseEdit"], vdims="size")
sankey1.opts(label_position='outer',
             edge_color="BaseEdit", edge_line_width=1,
             node_alpha=1.0, node_width=40, node_sort=False,
             width=1100, height=700, bgcolor="white")
<ipython-input-1-c45f1868a562>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Concordant["BaseEdit"] = Concordant["bulkRNA_REF"]+">"+Concordant["bulkRNA_ALT"] <ipython-input-1-c45f1868a562>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Unmapped["BaseEdit"] = Unmapped["bulkRNA_REF"]+">"+Unmapped["bulkRNA_ALT"]
# Bulk-RNA allelic depth (AD field: "ref,alt") for loci het in RNA but
# hom-ref in WGS (discordant) vs het in both (concordant).
AllelerateDisCordant = mergeDFUnique.loc[(mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] == "RNA_0/1") & (mergeDFUnique["CTL08AGenotype_WGS_sankey"] == "WGS_0/0"), "CTL08AGenotype_bulkRNA"].str.split(":", expand=True)[1]
AllelerateDisCordant = AllelerateDisCordant.to_frame("MAF")
AllelerateDisCordant["Concordant"] = "Discordant"
AllelerateDisCordant
AllelerateConcordant = mergeDFUnique.loc[(mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] == "RNA_0/1") & (mergeDFUnique["CTL08AGenotype_WGS_sankey"] == "WGS_0/1"), "CTL08AGenotype_bulkRNA"].str.split(":", expand=True)[1]
AllelerateConcordant = AllelerateConcordant.to_frame("MAF")
AllelerateConcordant["Concordant"] = "Concordant"
AllelerateConcordant
DistributionAgreement = pd.concat([AllelerateConcordant, AllelerateDisCordant])
# Split the "ref,alt" depth string ONCE (the original re-split it three
# times) and compute the alt-allele fraction (MAF) = alt / (ref + alt).
ad_counts = DistributionAgreement.MAF.str.split(",", expand=True).astype(int)
DistributionAgreement["MAF"] = ad_counts[1] / (ad_counts[0] + ad_counts[1])
sns.histplot(DistributionAgreement, x="MAF", hue="Concordant", bins=50)
<AxesSubplot:xlabel='MAF', ylabel='Count'>
# Same concordant/discordant comparison as above, but using the single-cell
# alt-allele proportion ("Prop") instead of the bulk-RNA allelic depth.
het_in_rna = mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] == "RNA_0/1"
discordant_mask = het_in_rna & (mergeDFUnique["CTL08AGenotype_WGS_sankey"] == "WGS_0/0")
concordant_mask = het_in_rna & (mergeDFUnique["CTL08AGenotype_WGS_sankey"] == "WGS_0/1")
AllelerateDisCordant = mergeDFUnique.loc[discordant_mask, "Prop"].to_frame("Prop")
AllelerateDisCordant["Concordant"] = "Discordant"
AllelerateConcordant = mergeDFUnique.loc[concordant_mask, "Prop"].to_frame("Prop")
AllelerateConcordant["Concordant"] = "Concordant"
AllelerateConcordant
DistributionAgreement = pd.concat([AllelerateConcordant, AllelerateDisCordant])
sns.histplot(DistributionAgreement, x="Prop", hue="Concordant", bins=50)
<AxesSubplot:xlabel='Prop', ylabel='Count'>
# ASE aggregated across all genotypes (per-genotype het loci, geno=None
# path of aseByObs), then keep only the significant results.
ASE2 = aseByObs(varAdata, genoOBS="cellID_newName" ,celltypeOBS="leidenAnnotated" , geno=None,threshold= 50,fdr=0.05 ,vcf=vcf,mappedVariants=MappedVars)
ASE2 = ASE2[ASE2.Significant]
Cluster CajalR_like
Total tested loci : 1189
NonCorrected p < 0.05 : 486
Corrected FDR < 0.05 : 381
Cluster Neurons
Total tested loci : 2472
NonCorrected p < 0.05 : 932
Corrected FDR < 0.05 : 724
Cluster Interneurons
Total tested loci : 988
NonCorrected p < 0.05 : 415
Corrected FDR < 0.05 : 340
Cluster GlutamatergicNeurons_early
Total tested loci : 1392
NonCorrected p < 0.05 : 550
Corrected FDR < 0.05 : 441
Cluster ProliferatingProgenitors
Total tested loci : 2211
NonCorrected p < 0.05 : 854
Corrected FDR < 0.05 : 655
Cluster MigratingNeurons
Total tested loci : 1298
NonCorrected p < 0.05 : 466
Corrected FDR < 0.05 : 337
Cluster OuterRadialGliaAstrocytes
Total tested loci : 1493
NonCorrected p < 0.05 : 543
Corrected FDR < 0.05 : 415
Cluster RadialGliaProgenitors
Total tested loci : 1611
NonCorrected p < 0.05 : 598
Corrected FDR < 0.05 : 467
Cluster GlutamatergicNeurons_late
Total tested loci : 2331
NonCorrected p < 0.05 : 869
Corrected FDR < 0.05 : 684
Total tested loci : 3718
Total significant scored loci : 4444
def ASEFlipTest(varAdata, ASEdf, minDev, minCounts):
    """
    Check, per cell type, whether ASE loci heterozygous in >=2 genotypes
    deviate from 0.5 in OPPOSITE directions across genotypes ("flipped" ASE).

    varAdata : AnnData with "RefReads"/"AltReads" layers (variants as vars).
    ASEdf : significant ASE results with "celltype" and "indexer" columns.
    minDev : minimum deviation from 0.5 for a genotype to count as skewed.
    minCounts : minimum total reads for a (genotype, locus) to be evaluated.

    Returns (CountsDF, FinalDF): per-locus b-values/counts for inconsistent
    loci, and a per-celltype summary of het vs inconsistent locus counts.

    NOTE(review): relies on the module-level `vcf` and `MappedVars` — confirm
    they match the VCF used to build `varAdata`.
    """
    FinalDF = pd.DataFrame(index=ASEdf["celltype"].unique(),
                           columns=["Loci_Het_in_Atl2Genotypes",
                                    "Loci_Het_and_inconsistent_in_Atl2Genotypes"])
    CountsDF = pd.DataFrame()
    for cellType in ASEdf["celltype"].unique():
        ASEdfcl = ASEdf[ASEdf["celltype"] == cellType]
        testDF = pd.DataFrame(index=ASEdfcl.indexer.unique().tolist())
        for g in varAdata.obs.cellID_newName.unique():
            # Loci het ("0/1") in genotype g among this cell type's ASE loci.
            p1 = vcf.loc[(ASEdfcl.indexer.unique().tolist())]
            p1 = p1[p1[g].str.contains("0/1")].index.tolist()
            t = varAdata[varAdata.obs.cellID_newName == g, p1]
            total = (t.layers["RefReads"].sum(axis=0).flatten().A1
                     + t.layers["AltReads"].sum(axis=0).flatten().A1)
            # Alt-allele fraction per locus for this genotype (NaN where
            # total == 0 — hence the RuntimeWarning in the original run).
            t = (t.layers["AltReads"].sum(axis=0).flatten().A1 / total)
            t = pd.Series(t, index=p1)
            t = t[total >= minCounts]
            testDF[g] = t
        # Inconsistent = at least one genotype skewed above 0.5+minDev AND at
        # least one skewed below 0.5-minDev.
        # BUGFIX: the second clause was missing "> 0"; `bool & count` is a
        # BITWISE and (e.g. True & 2 == 0), which silently dropped loci where
        # exactly 2 genotypes fell below the lower bound.
        inconsistentMask = (((testDF >= .5 + minDev).sum(axis=1) > 0)
                            & ((testDF <= .5 - minDev).sum(axis=1) > 0))
        inconsistentLoci = testDF[inconsistentMask].index.tolist()
        # For each inconsistent locus, the genotypes with a measurable value.
        LociToIDMap = testDF.loc[inconsistentLoci].apply(
            lambda row: row[row.notna()].index.tolist(), axis=1).to_dict()
        CountsDFlocal = pd.DataFrame(index=list(LociToIDMap.keys()))
        CountsDFlocal["locus"] = LociToIDMap.keys()
        CountsDFlocal["celltype"] = cellType
        for locus, genotypes in LociToIDMap.items():
            varAdataSS = varAdata[:, locus]
            for g in genotypes:
                # Per-genotype alt fraction (b-value) and total read count.
                Refsum = varAdataSS[varAdataSS.obs.cellID_newName == g].layers["RefReads"].sum()
                AltSum = varAdataSS[varAdataSS.obs.cellID_newName == g].layers["AltReads"].sum()
                totalSum = Refsum + AltSum
                bVal = AltSum / totalSum
                CountsDFlocal.loc[locus, "{}_bval|counts".format(g)] = \
                    str(round(bVal, 2)) + "|" + str(round(totalSum))
        CountsDF = pd.concat([CountsDF, CountsDFlocal], ignore_index=True)
        # Store results in summary table
        FinalDF.loc[cellType, "Loci_Het_in_Atl2Genotypes"] = (testDF.notna().sum(axis=1) > 1).sum()
        FinalDF.loc[cellType, "Loci_Het_and_inconsistent_in_Atl2Genotypes"] = inconsistentMask.sum()
    # NOTE(review): this merge result was computed but never used in the
    # original; kept (side-effect free) so behavior is unchanged.
    CountsDF.set_index(["locus"]).merge(MappedVars, left_index=True, right_index=True).iloc[:, [0, 1, 2, 3, 4, 7]]
    return CountsDF, FinalDF
# Run the cross-genotype consistency check on the aggregated ASE results.
AllelicCounts, SummaryDF = ASEFlipTest(varAdata, ASE2, minDev=.05, minCounts=20 )
<ipython-input-1-8fd354d71ee2>:26: RuntimeWarning: invalid value encountered in true_divide t = (t.layers["AltReads"].sum(axis = 0).flatten().A1 / total) <ipython-input-1-8fd354d71ee2>:26: RuntimeWarning: invalid value encountered in true_divide t = (t.layers["AltReads"].sum(axis = 0).flatten().A1 / total) <ipython-input-1-8fd354d71ee2>:26: RuntimeWarning: invalid value encountered in true_divide t = (t.layers["AltReads"].sum(axis = 0).flatten().A1 / total) <ipython-input-1-8fd354d71ee2>:26: RuntimeWarning: invalid value encountered in true_divide t = (t.layers["AltReads"].sum(axis = 0).flatten().A1 / total)
AllelicCounts
| locus | celltype | CTL01_bval|counts | CTL08A_bval|counts | CTL02A_bval|counts | CTL04E_bval|counts | |
|---|---|---|---|---|---|---|
| 0 | 22_48750487 | CajalR_like | 0.44|61 | 0.56|118 | NaN | NaN |
| 1 | 17_16342702 | Neurons | 0.55|2482 | 0.54|2310 | 0.43|9785 | NaN |
| 2 | 12_76027474 | Neurons | 0.04|93 | NaN | 0.74|810 | NaN |
| 3 | 1_161214307 | Neurons | NaN | 0.7|190 | 0.31|905 | NaN |
| 4 | 6_87529548 | Neurons | NaN | NaN | 0.44|315 | 0.56|27 |
| 5 | 2_97549448 | Neurons | 0.3|30 | NaN | 0.68|537 | NaN |
| 6 | 20_25452900 | Neurons | 0.42|137 | 0.5|207 | 0.57|440 | NaN |
| 7 | 16_70162281 | Neurons | 0.52|46 | 0.55|47 | 0.27|228 | NaN |
| 8 | 14_64052737 | Neurons | 0.45|47 | 0.6|40 | 0.45|883 | NaN |
| 9 | 2_28802613 | Neurons | 0.57|194 | 0.54|270 | 0.58|699 | 0.33|73 |
| 10 | 4_87529065 | OuterRadialGliaAstrocytes | 0.7|23 | NaN | 0.32|1171 | NaN |
| 11 | 16_70162281 | OuterRadialGliaAstrocytes | 0.52|46 | 0.55|47 | 0.27|228 | NaN |
| 12 | 14_69354229 | OuterRadialGliaAstrocytes | NaN | 0.61|281 | 0.45|350 | NaN |
| 13 | 1_161214307 | OuterRadialGliaAstrocytes | NaN | 0.7|190 | 0.31|905 | NaN |
| 14 | 17_16342702 | OuterRadialGliaAstrocytes | 0.55|2482 | 0.54|2310 | 0.43|9785 | NaN |
| 15 | 14_64052737 | OuterRadialGliaAstrocytes | 0.45|47 | 0.6|40 | 0.45|883 | NaN |
| 16 | 6_149632845 | OuterRadialGliaAstrocytes | 0.57|110 | NaN | 0.46|584 | 0.44|81 |
| 17 | 17_16342702 | ProliferatingProgenitors | 0.55|2482 | 0.54|2310 | 0.43|9785 | NaN |
| 18 | 1_161214307 | ProliferatingProgenitors | NaN | 0.7|190 | 0.31|905 | NaN |
| 19 | 11_65717866 | ProliferatingProgenitors | 0.6|62 | NaN | 0.51|138 | 0.44|43 |
| 20 | 11_116748357 | ProliferatingProgenitors | NaN | 0.58|159 | 0.44|474 | 0.59|82 |
| 21 | 12_120579368 | ProliferatingProgenitors | NaN | 0.52|306 | 0.57|670 | 0.44|158 |
| 22 | 19_17582871 | ProliferatingProgenitors | 0.4|94 | 0.58|113 | 0.53|225 | NaN |
| 23 | 11_62662863 | ProliferatingProgenitors | NaN | NaN | 0.44|373 | 0.67|36 |
| 24 | 22_21701105 | ProliferatingProgenitors | 0.56|110 | 0.61|77 | 0.43|366 | NaN |
| 25 | 17_16342702 | MigratingNeurons | 0.55|2482 | 0.54|2310 | 0.43|9785 | NaN |
| 26 | 2_97549448 | MigratingNeurons | 0.3|30 | NaN | 0.68|537 | NaN |
| 27 | 16_88715206 | MigratingNeurons | 0.43|168 | 0.65|167 | NaN | NaN |
| 28 | 2_37204503 | MigratingNeurons | 0.6|166 | 0.45|155 | NaN | NaN |
| 29 | 2_28802613 | RadialGliaProgenitors | 0.57|194 | 0.54|270 | 0.58|699 | 0.33|73 |
| 30 | 12_120579368 | RadialGliaProgenitors | NaN | 0.52|306 | 0.57|670 | 0.44|158 |
| 31 | 20_63247598 | RadialGliaProgenitors | 0.5|268 | 0.38|211 | 0.57|461 | NaN |
| 32 | 17_16342702 | GlutamatergicNeurons_late | 0.55|2482 | 0.54|2310 | 0.43|9785 | NaN |
| 33 | 12_76027474 | GlutamatergicNeurons_late | 0.04|93 | NaN | 0.74|810 | NaN |
| 34 | 2_97549448 | GlutamatergicNeurons_late | 0.3|30 | NaN | 0.68|537 | NaN |
| 35 | 1_161214307 | GlutamatergicNeurons_late | NaN | 0.7|190 | 0.31|905 | NaN |
| 36 | 11_45240575 | GlutamatergicNeurons_late | 0.57|82 | NaN | 0.36|282 | 0.61|77 |
| 37 | 2_197496910 | GlutamatergicNeurons_late | 0.32|59 | NaN | 0.82|158 | NaN |
| 38 | 9_83978399 | GlutamatergicNeurons_late | 0.37|49 | NaN | 0.54|778 | 0.61|56 |
| 39 | 16_15084681 | GlutamatergicNeurons_late | 0.75|20 | NaN | 0.45|253 | 0.59|34 |
| 40 | 18_9256260 | GlutamatergicNeurons_late | 0.57|30 | NaN | 0.41|329 | NaN |
| 41 | 13_110641500 | GlutamatergicNeurons_late | NaN | 0.59|41 | 0.39|324 | 0.58|38 |
| 42 | 2_97547721 | GlutamatergicNeurons_late | NaN | 0.38|77 | 0.59|138 | NaN |
| 43 | 3_25638247 | GlutamatergicNeurons_late | 0.53|60 | 0.79|53 | 0.43|998 | NaN |
| 44 | 11_117285836 | GlutamatergicNeurons_late | 0.34|29 | 0.74|23 | 0.46|318 | NaN |
| 45 | 13_52689848 | GlutamatergicNeurons_late | 0.64|22 | NaN | 0.34|139 | NaN |
| 46 | 11_116748357 | GlutamatergicNeurons_late | NaN | 0.58|159 | 0.44|474 | 0.59|82 |
| 47 | 17_16342702 | Interneurons | 0.55|2482 | 0.54|2310 | 0.43|9785 | NaN |
| 48 | 12_122865657 | Interneurons | 0.58|279 | NaN | 0.54|566 | 0.37|87 |
| 49 | 11_116748357 | Interneurons | NaN | 0.58|159 | 0.44|474 | 0.59|82 |
| 50 | 1_161214307 | Interneurons | NaN | 0.7|190 | 0.31|905 | NaN |
# Expose the cell-type index as a column for melting/plotting below.
SummaryDF["cluster"] = SummaryDF.index
SummaryDF
| Loci_Het_in_Atl2Genotypes | Loci_Het_and_inconsistent_in_Atl2Genotypes | cluster | |
|---|---|---|---|
| CajalR_like | 165 | 1 | CajalR_like |
| Neurons | 291 | 9 | Neurons |
| OuterRadialGliaAstrocytes | 191 | 7 | OuterRadialGliaAstrocytes |
| GlutamatergicNeurons_early | 209 | 0 | GlutamatergicNeurons_early |
| ProliferatingProgenitors | 274 | 8 | ProliferatingProgenitors |
| MigratingNeurons | 164 | 4 | MigratingNeurons |
| RadialGliaProgenitors | 213 | 3 | RadialGliaProgenitors |
| GlutamatergicNeurons_late | 246 | 15 | GlutamatergicNeurons_late |
| Interneurons | 140 | 4 | Interneurons |
# Bar plot per cell type: loci het in >=2 genotypes vs loci additionally
# inconsistent (opposite allelic skew) across genotypes.
sns.barplot(data=SummaryDF.melt("cluster"), x="cluster", y="value", hue="variable")
plt.xticks(rotation=45)
(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), [Text(0, 0, 'CajalR_like'), Text(1, 0, 'Neurons'), Text(2, 0, 'OuterRadialGliaAstrocytes'), Text(3, 0, 'GlutamatergicNeurons_early'), Text(4, 0, 'ProliferatingProgenitors'), Text(5, 0, 'MigratingNeurons'), Text(6, 0, 'RadialGliaProgenitors'), Text(7, 0, 'GlutamatergicNeurons_late'), Text(8, 0, 'Interneurons')])